library(AppliedPredictiveModeling)
library(tidyverse)
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 2.2.1     v purrr   0.2.4
v tibble  1.4.2     v dplyr   0.7.5
v tidyr   0.8.1     v stringr 1.3.1
v readr   1.1.1     v forcats 0.3.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
library(caret)
Loading required package: lattice

Attaching package: 㤼㸱caret㤼㸲

The following object is masked from 㤼㸱package:purrr㤼㸲:

    lift

Compute

data(segmentationOriginal)
segmentationOriginal <- as_tibble(segmentationOriginal)
segmentationOriginal
seg_data <- subset(segmentationOriginal, Case == "Train")
seg_data
cell_id <- seg_data$Case
class <- seg_data$Class
case <- seg_data$Case
seg_data <- seg_data[, -(1:3)]
seg_data %>% select(-contains("Status")) -> seg_data
seg_data

Skewness

library(e1071)
skewness(seg_data$AngleCh1)
[1] -0.02426252
#seg_data %>% map_dfr(skewness)
summarize_all(seg_data, skewness)

Box-Cox transform

Ch1AreaTrans <- BoxCoxTrans(seg_data$AreaCh1)
Ch1AreaTrans
Box-Cox Transformation

1009 data points used to estimate Lambda

Input data summary:
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  150.0   194.0   256.0   325.1   376.0  2186.0 

Largest/Smallest: 14.6 
Sample Skewness: 3.53 

Estimated Lambda: -0.9 

Apply the transform with the predict function

predict(Ch1AreaTrans, head(seg_data$AreaCh1)) -> dat
dat
[1] 1.108458 1.106383 1.104520 1.103554 1.103607 1.105523

Or perform it all at once via caret::preProcess

percent_variance[1:3]
[1] 20.91236 17.01330 11.88689

Near zero variance

nearZeroVar(seg_data)
integer(0)

Correlations

correlations <- cor(seg_data)
dim(correlations)
[1] 58 58
correlations[1:4, 1:4]
                AngleCh1      AreaCh1 AvgIntenCh1 AvgIntenCh2
AngleCh1     1.000000000 -0.002627172 -0.04300776 -0.01944681
AreaCh1     -0.002627172  1.000000000 -0.02529739 -0.15330301
AvgIntenCh1 -0.043007757 -0.025297394  1.00000000  0.52521711
AvgIntenCh2 -0.019446810 -0.153303007  0.52521711  1.00000000

Exercises

LS0tCnRpdGxlOiAiQ2hhcHRlciAzIC0gRGF0YSBQcmUtcHJvY2Vzc2luZyIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCmBgYHtyIHNldHVwfQpsaWJyYXJ5KEFwcGxpZWRQcmVkaWN0aXZlTW9kZWxpbmcpICAgIyBEYXRhIHNldHMKbGlicmFyeSh0aWR5dmVyc2UpICAgICAgICAgICAgICAgICAgICMgT3BwYW4gdGlkeSBzdHlsZQpsaWJyYXJ5KGNhcmV0KSAgICAgICAgICAgICAgICAgICAgICAgIyBNb2RlbGluZwpsaWJyYXJ5KGUxMDcxKSAgICAgICAgICAgICAgICAgICAgICAgIyBza2V3bmVzcwpgYGAKCiMjIENvbXB1dGUKCmBgYHtyfQpkYXRhKHNlZ21lbnRhdGlvbk9yaWdpbmFsKQpzZWdtZW50YXRpb25PcmlnaW5hbCA8LSBhc190aWJibGUoc2VnbWVudGF0aW9uT3JpZ2luYWwpCnNlZ21lbnRhdGlvbk9yaWdpbmFsCmBgYAoKYGBge3J9CnNlZ19kYXRhIDwtIHN1YnNldChzZWdtZW50YXRpb25PcmlnaW5hbCwgQ2FzZSA9PSAiVHJhaW4iKQpzZWdfZGF0YQpgYGAKCmBgYHtyfQpjZWxsX2lkIDwtIHNlZ19kYXRhJENhc2UKY2xhc3MgPC0gc2VnX2RhdGEkQ2xhc3MKY2FzZSA8LSBzZWdfZGF0YSRDYXNlCnNlZ19kYXRhIDwtIHNlZ19kYXRhWywgLSgxOjMpXQpzZWdfZGF0YSAlPiUgc2VsZWN0KC1jb250YWlucygiU3RhdHVzIikpIC0+IHNlZ19kYXRhCnNlZ19kYXRhCmBgYAoKU2tld25lc3MKCmBgYHtyfQpza2V3bmVzcyhzZWdfZGF0YSRBbmdsZUNoMSkKI3NlZ19kYXRhICU+JSBtYXBfZGZyKHNrZXduZXNzKQpzdW1tYXJpemVfYWxsKHNlZ19kYXRhLCBza2V3bmVzcykKYGBgCgpCb3gtQ294IHRyYW5zZm9ybQpgYGB7cn0KQ2gxQXJlYVRyYW5zIDwtIEJveENveFRyYW5zKHNlZ19kYXRhJEFyZWFDaDEpCkNoMUFyZWFUcmFucwpgYGAKCkFwcGx5IHRoZSB0cmFuc2Zvcm0gd2l0aCB0aGUgYHByZWRpY3RgIGZ1bmN0aW9uCgpgYGB7cn0KcHJlZGljdChDaDFBcmVhVHJhbnMsIGhlYWQoc2VnX2RhdGEkQXJlYUNoMSkpIC0+IGRhdApkYXQKYGBgCgpPciBwZXJmb3JtIGl0IGFsbCBhdCBvbmNlIHZpYSBgY2FyZXQ6OnByZVByb2Nlc3NgCgpgYGB7cn0KcGNhX29iamVjdCA8LSBwcmNvbXAoc2VnX2RhdGEsIGNlbnRlciA9IFRSVUUsIHNjYWxlID0gVFJVRSkKcGVyY2VudF92YXJpYW5jZSA8LSBwY2Ffb2JqZWN0JHNkZXZeMi9zdW0ocGNhX29iamVjdCRzZGV2XjIpKjEwMApwZXJjZW50X3ZhcmlhbmNlWzE6M10KYGBgCgpOZWFyIHplcm8gdmFyaWFuY2UKYGBge3J9Cm5lYXJaZXJvVmFyKHNlZ19kYXRhKQpgYGAKCkNvcnJlbGF0aW9ucwoKYGBge3J9CmNvcnJlbGF0aW9ucyA8LSBjb3Ioc2VnX2RhdGEpCmRpbShjb3JyZWxhdGlvbnMpCmNvcnJlbGF0aW9uc1sxOjQsIDE6NF0KYGBgCgpgYGB7cn0KY29ycnBsb3Q6OmNvcnJwbG90KGNvcnJlbGF0aW9ucywgb3JkZXIgPSAiaGNsdXN0IikgCmBgYAoKIyMgRXhlcmNpc2VzCg==